# SSI_coding.R 

# Part of the replication archive for 
#
#   Bullock, John G., and Kelly Rader. 2021. "Response Options and the 
#   Measurement of Political Knowledge." Forthcoming in the British Journal 
#   of Political Science.
#
# This file codes variables from the study that we describe in our article.


library(Bullock)  # qw()
library(dplyr)    # %>%, coalesce(), left_join()
library(here)     # for here::here()
library(readxl)   # read_excel()
library(tidyr)    # unite()
Recode <- car::Recode

source(here::here("R/functions/eliminateBreakoffs.R"))



# SET CONTRASTS FOR ORDERED FACTORS
# Polynomial contrasts are R's default for ordered factors.  Treatment 
# contrasts are easier to interpret, so we use them for ordered and 
# unordered factors alike.
options(
  contrasts = c(
    unordered = "contr.treatment", 
    ordered   = "contr.treatment"))


# IMPORT DATA FROM THE EXPERIMENT
# The Qualtrics export uses the "legacy" format.  The newer format omits the 
# randomization ("display order") variables that record, for example, whether 
# a subject was assigned to the easy or the hard version of a question.  The 
# legacy format also keeps the question numbering in the dataset aligned with 
# the question numbering in our Qualtrics study.  
#   Qualtrics export settings:
#   --Check "legacy View Results format"
#   --Check "Use choice text"
#   --Check "Use question numbers"
#   --Check "Recode seen but unanswered questions as -99"
#   --Check "Export viewing order data" 
# Strings are read as factors; later code relies on that (e.g., it converts 
# columns with as.character() before converting them to numbers).
originalData <- read.csv(
  file             = here::here("data/2017-03-30_SSI_knowledgeAndResponseOptions.csv"),
  stringsAsFactors = TRUE)


# INITIAL DATA PROCESSING
# The first row of the export is kept as a named character vector 
# ("questionKey": one entry per column) and then removed from the data.
questionKey  <- sapply(originalData[1, ], as.character)
originalData <- originalData[-1, ]



# **************************************************************************
# ELIMINATE CASES CREATED BY THE AUTHORS ####
# **************************************************************************
# Drop every record whose host (column V6) contains the authors' IP address.
# We use a logical mask from grepl() rather than negative indices from 
# grep(): if no record matched, -grep(...) would be -integer(0), which 
# selects ZERO rows and would silently empty the dataset.  fixed = TRUE 
# makes the dots in the address match literally instead of as regex 
# wildcards.
host <- as.character(originalData$V6)
IP_toRemove <- '70.117.88.178'
originalData <- originalData[!grepl(IP_toRemove, host, fixed = TRUE), ]
rm(host, IP_toRemove)



# **************************************************************************
# GENERATE A LIST OF BREAK-OFFS ####
# **************************************************************************
# This call does not remove any break-offs (remove = FALSE).  It just adds 
# a "breakOffs" column to originalData, indicating whether each case is or 
# isn't a break-off.  That column is used below when duplicate records are 
# eliminated.
# NOTE(review): eliminateBreakoffs() comes from the file sourced at the top 
# of this script (R/functions/eliminateBreakoffs.R); the break-off rule 
# itself is defined there, not here.
originalData <- eliminateBreakoffs(originalData, remove = FALSE)



# **************************************************************************
# ELIMINATE DUPLICATE RECORDS ####
# **************************************************************************
# SSI provides a unique ID, "PSID", for each user. Some of the records in 
# originalData share the same PSID_IP -- that is, the same PSID and the same 
# IP. We want to keep only one record per PSID_IP. 
# That is, we want to ensure that PSID_IP is never duplicated. 
#   To eliminate records, we follow this rule. First, within each PSID_IP 
# set, eliminate any records that are AAPOR breakoffs. Doing this leaves
# only 11 pairs of records that share a PSID_IP. For these 11 pairs, we 
# discard the first record, keeping the second.

# Build the de-duplication key (psid_IP = psid and V6 pasted together; the 
# source columns are kept) and a temporary row identifier, ID.
# seq_len(n()) is used instead of 1:nrow(...) so that an empty data frame 
# yields an empty ID column rather than c(1, 0).
originalData <- originalData %>%
  unite(psid_IP, c("psid", "V6"), remove = FALSE) %>%
  mutate(ID = seq_len(n())) 

# Get the row IDs of cases that (a) share their psid_IP with at least one 
# other record and (b) are break-offs.  These are the cases to eliminate 
# first.
duplicateBreakoffs <- originalData %>%   
  group_by(psid_IP) %>%
  
  # Keep only the pairs or sets of records that have duplicate PSID and IP.
  filter(n() >= 2) %>%  

  # Keep the break-offs within those sets.
  filter(breakOffs) %>%
  ungroup() %>%
  pull(ID)

originalData <- originalData %>%
  filter(! ID %in% duplicateBreakoffs) 


# For remaining sets of duplicate records, keep the last record and discard
# the earlier one(s).  duplicated(..., fromLast = TRUE) flags every record 
# except the last one within each psid_IP.  The column psid_IP is referenced 
# directly (not via originalData$psid_IP) so the flag is always computed on 
# the data flowing through the pipe.
originalData <- originalData %>%
  mutate(dupe = duplicated(psid_IP, fromLast = TRUE)) %>%
  filter(!dupe)


# Clean up: drop the helper object and the helper columns.  psid_IP stays.
rm(duplicateBreakoffs)
originalData <- originalData %>% select(-ID, -dupe)



# **************************************************************************
# ELIMINATE SUBJECTS WHO DIDN'T AGREE TO STAY AWAY FROM OUTSIDE SOURCES ####
# **************************************************************************
# Per page 7 of our pre-analysis plan, we exclude all of these subjects 
# from our main analyses. 
# The exclusion is applied unless the calling environment defines 
# ELIMINATE_CHEATERS and sets it to FALSE.
if (!exists("ELIMINATE_CHEATERS") || ELIMINATE_CHEATERS) {
  # Keep only subjects whose answer to Q3.1 is exactly "Yes".  %in% never 
  # returns NA, so a missing answer is treated as "did not agree" and the 
  # row is dropped.  (Indexing rows with a logical vector that contains NA 
  # would instead insert all-NA phantom rows.)
  agreeNotToUseOutsideSources <- originalData$Q3.1 %in% 'Yes'
  originalData <- originalData[agreeNotToUseOutsideSources, ]
}



# **************************************************************************
# ELIMINATE AAPOR "BREAK-OFF" SUBJECTS ####
# **************************************************************************
# Per page 7 of our pre-analysis plan, we eliminate these subjects from our
# analyses.
# The break-offs are removed unless the calling environment defines 
# ELIMINATE_BREAKOFFS and sets it to FALSE.  The same helper was called 
# earlier with remove = FALSE to flag break-offs; here it is called with 
# remove = TRUE.
if (!exists("ELIMINATE_BREAKOFFS") || ELIMINATE_BREAKOFFS) {
  originalData <- eliminateBreakoffs(originalData, remove = TRUE)
}



# **************************************************************************
# NUMBER-OF-RESPONSE-OPTIONS MANIPULATIONS ####
# **************************************************************************
# Parse a "displayed options" variable into a list of option vectors.
#
# Args:
#   fac: a factor.  Each value is a string of options separated by 
#        semicolons (optionally followed by white space).
#
# Returns:
#   A list with one slot per element of `fac`.  Each slot is a character 
#   vector of the options.  Entries that are NA, blank, or contain nothing 
#   but white space and/or a single trailing semicolon yield a slot holding 
#   NA_character_ (length 1), so that callers counting options with 
#   length() - 1 get 0 for "no options displayed".
processDisplayOptionVars <- function (fac) {
  stopifnot(is.factor(fac))
  
  opts <- as.character(fac)
  opts <- sub('^\\s+', '', opts)  # remove leading white space
  opts <- sub(';$', '', opts)     # remove trailing semicolon
  
  # Code empty entries as NA AFTER trimming.  strsplit('') returns 
  # character(0), so an entry such as ' ' or ';' would otherwise be counted 
  # as having zero elements instead of being treated as missing.
  opts[!is.na(opts) & opts == ''] <- NA
  
  # One character vector per entry; NA entries stay NA.
  strsplit(opts, split = ';\\s*')
}

# Parse the "displayed options" column for each of the twelve questions whose
# number of response options was manipulated.  Each result is a list with one
# character vector per subject (see processDisplayOptionVars() above).
# Note that JusticesChosenDisplayedOptions is read from the column named
# SupremeCourtJusticesChosenDisplayedOptions; every other object has the same
# name as its source column.
ChiefJusticeDisplayedOptions             <- processDisplayOptionVars(originalData$ChiefJusticeDisplayedOptions)
SenMajLeaderDisplayedOptions             <- processDisplayOptionVars(originalData$SenMajLeaderDisplayedOptions)
HowManyJusticesCurrentlyDisplayedOptions <- processDisplayOptionVars(originalData$HowManyJusticesCurrentlyDisplayedOptions)
JusticesChosenDisplayedOptions           <- processDisplayOptionVars(originalData$SupremeCourtJusticesChosenDisplayedOptions)
WatchLawyersArgueDisplayedOptions        <- processDisplayOptionVars(originalData$WatchLawyersArgueDisplayedOptions)
ConflictOverMeaningDisplayedOptions      <- processDisplayOptionVars(originalData$ConflictOverMeaningDisplayedOptions)
HowManyJusticesUsuallyDisplayedOptions   <- processDisplayOptionVars(originalData$HowManyJusticesUsuallyDisplayedOptions)
IfJusticesSplitDisplayedOptions          <- processDisplayOptionVars(originalData$IfJusticesSplitDisplayedOptions)
HowManyWomenDisplayedOptions             <- processDisplayOptionVars(originalData$HowManyWomenDisplayedOptions)
CourtPowerDescriptionDisplayedOptions    <- processDisplayOptionVars(originalData$CourtPowerDescriptionDisplayedOptions)
TermLengthDisplayedOptions               <- processDisplayOptionVars(originalData$TermLengthDisplayedOptions)
JusticeRemovalDisplayedOptions           <- processDisplayOptionVars(originalData$JusticeRemovalDisplayedOptions)


# NUMBER-OF-RESPONSE-OPTION VARIABLES
# By default, these variables would range from 4 to 6, because Qualtrics  
# counts "don't know" as a response option.  We subtract 1 from each value,  
# so that the variables indicate the number of -substantive- response   
# options that were presented.  That is, we are not counting "don't know" as   
# a response option.
#   A subject with no displayed options has a length-1 NA slot in the 
# corresponding list, so the count is 1 - 1 = 0, which is recoded to NA.
#   lengths() is used instead of sapply(x, length) because it always returns 
# an integer vector, whereas sapply() returns an empty list for empty input.
ChiefJustice_numROs             <- Recode(lengths(ChiefJusticeDisplayedOptions) - 1,             "0=NA")
SenMajLeader_numROs             <- Recode(lengths(SenMajLeaderDisplayedOptions) - 1,             "0=NA")
HowManyJusticesCurrently_numROs <- Recode(lengths(HowManyJusticesCurrentlyDisplayedOptions) - 1, "0=NA")
JusticesChosen_numROs           <- Recode(lengths(JusticesChosenDisplayedOptions) - 1,           "0=NA")
WatchLawyersArgue_numROs        <- Recode(lengths(WatchLawyersArgueDisplayedOptions) - 1,        "0=NA")
ConflictOverMeaning_numROs      <- Recode(lengths(ConflictOverMeaningDisplayedOptions) - 1,      "0=NA")
HowManyJusticesUsually_numROs   <- Recode(lengths(HowManyJusticesUsuallyDisplayedOptions) - 1,   "0=NA")
IfJusticesSplit_numROs          <- Recode(lengths(IfJusticesSplitDisplayedOptions) - 1,          "0=NA")
HowManyWomen_numROs             <- Recode(lengths(HowManyWomenDisplayedOptions) - 1,             "0=NA")
CourtPowerDescription_numROs    <- Recode(lengths(CourtPowerDescriptionDisplayedOptions) - 1,    "0=NA")
TermLength_numROs               <- Recode(lengths(TermLengthDisplayedOptions) - 1,               "0=NA")
JusticeRemoval_numROs           <- Recode(lengths(JusticeRemovalDisplayedOptions) - 1,           "0=NA")



# **************************************************************************
# MULTIPLE-CHOICE QUESTIONS (AND OPEN-ENDED VARIANTS) ####
# **************************************************************************

# HOW MANY JUSTICES CURRENTLY SERVE?
# In Qualtrics, we created a (uniform) random variable ranging from 0 to 1:
# "HowManyJusticesCurrentlySeeOpenEnded." If it was >= .75 for a given 
# subject, we assigned that subject to the open-ended condition.  
# The column is a factor (the data were read with stringsAsFactors = TRUE), 
# so it is converted to character before it is converted to a number.
HowManyJusticesCurrentlySeeOpenEnded <- as.numeric(as.character(originalData$HowManyJusticesCurrentlySeeOpenEnded))
assignedToHowManyJusticesCurrentlyOE <- HowManyJusticesCurrentlySeeOpenEnded >= .75

# In this code block, "diff" stands for "difficulty."
# The display-order column is recoded to "hard"/"easy"; blanks become NA.
# NOTE(review): "Q4.1|Q4.3" and "Q4.2|Q4.3" are matched as literal factor 
# levels; presumably they are Qualtrics display-order strings -- confirm.
# An "OE" level is then added so that subjects in the open-ended condition 
# can be labeled as such.
HowManyJusticesCurrently_diff <- droplevels(originalData$DO.BL.Howmanyjusticescurrentlyserve.)
HowManyJusticesCurrently_diff <- Recode(
  var     = HowManyJusticesCurrently_diff,
  recodes = '""=NA; "Q4.1|Q4.3"="hard"; "Q4.2|Q4.3"="easy"')
levels(HowManyJusticesCurrently_diff) <- c(levels(HowManyJusticesCurrently_diff), 'OE') 
HowManyJusticesCurrently_diff[assignedToHowManyJusticesCurrentlyOE] <- 'OE'

# Closed-ended ("CE") answers.  Q4.1 holds the hard version and Q4.2 the 
# easy version.  Blank becomes NA; -99 (seen but unanswered) becomes 
# "Skipped".  coalesce() takes the hard-version answer where it exists and 
# the easy-version answer otherwise.
# NOTE(review): coalesce() is applied to two factors whose level sets may 
# differ; how that is handled depends on the installed dplyr version -- 
# verify.
HowManyJusticesCurrently_hard <- Recode(originalData$Q4.1, '""=NA; -99="Skipped"')
HowManyJusticesCurrently_hard <- droplevels(HowManyJusticesCurrently_hard)
HowManyJusticesCurrently_easy <- Recode(originalData$Q4.2, '""=NA; -99="Skipped"')
HowManyJusticesCurrently_easy <- droplevels(HowManyJusticesCurrently_easy)
HowManyJusticesCurrently_CE      <- coalesce(
  HowManyJusticesCurrently_hard,
  HowManyJusticesCurrently_easy)
HowManyJusticesCurrently_CE_correct <- HowManyJusticesCurrently_CE=='8'

# Open-ended ("OE") answers (Q104).  Subjects who were not assigned to the 
# open-ended condition are coded NA.
HowManyJusticesCurrently_OE <- as.character(originalData$Q104)
HowManyJusticesCurrently_OE[!assignedToHowManyJusticesCurrentlyOE] <- NA

# If people had dropped out of the survey by the previous closed-ended 
# question, code them as NA for this question.  [2020 01 16] 
HowManyJusticesCurrently_OE[originalData$Q123 == ''] <- NA

# An open-ended answer is counted as correct if it contains "8" or "eight" 
# (case-insensitive).  grepl() returns FALSE for NA input, so NA is restored 
# afterward for subjects who have no open-ended answer.
# NOTE(review): the pattern is unanchored, so it also matches answers such 
# as "18" or "80".
HowManyJusticesCurrently_OE_correct <- HowManyJusticesCurrently_OE
HowManyJusticesCurrently_OE_correct <- grepl(
  pattern     = '8|eight', 
  x           = HowManyJusticesCurrently_OE, 
  ignore.case = TRUE)
HowManyJusticesCurrently_OE_correct[is.na(HowManyJusticesCurrently_OE)] <- NA

# Combined correctness: the closed-ended result where available; otherwise 
# the open-ended result.
HowManyJusticesCurrently_correct <- HowManyJusticesCurrently_CE_correct
HowManyJusticesCurrently_correct[is.na(HowManyJusticesCurrently_correct)] <- 
  HowManyJusticesCurrently_OE_correct[is.na(HowManyJusticesCurrently_correct)]

# Make a single variable that indicates all assignments
# factor(a):b builds the interaction of the two factors, with levels such as 
# "3:easy".  Re-creating the factor with an explicit level set discards all 
# other combinations; the open-ended subjects are then labeled "OE".
HowManyJusticesCurrently_condition <- factor(HowManyJusticesCurrently_numROs):HowManyJusticesCurrently_diff
HowManyJusticesCurrently_condition <- factor(
  x      = HowManyJusticesCurrently_condition, 
  levels = qw("3:easy 3:hard 5:easy 5:hard OE"))  
HowManyJusticesCurrently_condition[assignedToHowManyJusticesCurrentlyOE] <- 'OE'



# HOW MANY JUSTICES USUALLY SERVE?
# In Qualtrics, we created a (uniform) random variable ranging from 0 to 1:
# "HowManyJusticesUsuallySeeOpenEnded." If it was >= .75 for a given 
# subject, we assigned that subject to the open-ended condition.  
# The column is a factor, so it is converted to character before it is 
# converted to a number.
HowManyJusticesUsuallySeeOpenEnded <- as.numeric(as.character(originalData$HowManyJusticesUsuallySeeOpenEnded))
assignedToHowManyJusticesUsuallyOE <- HowManyJusticesUsuallySeeOpenEnded >= .75

# Difficulty ("diff") assignment from the display-order column: blanks 
# become NA, and an "OE" level is added for open-ended subjects.
# NOTE(review): "Q14.1|Q14.3" and "Q14.2|Q14.3" are matched as literal 
# factor levels; presumably they are Qualtrics display-order strings -- 
# confirm.
HowManyJusticesUsually_diff    <- droplevels(originalData$DO.BL.Howmanyjusticesusuallyserve)
HowManyJusticesUsually_diff    <- Recode(
  var     = HowManyJusticesUsually_diff,
  recodes = '""=NA; "Q14.1|Q14.3"="hard"; "Q14.2|Q14.3"="easy"')
levels(HowManyJusticesUsually_diff) <- c(levels(HowManyJusticesUsually_diff), 'OE') 
HowManyJusticesUsually_diff[assignedToHowManyJusticesUsuallyOE] <- 'OE'

# Closed-ended ("CE") answers: Q14.1 is the hard version, Q14.2 the easy 
# version.  Blank becomes NA; -99 becomes "Skipped".  coalesce() takes the 
# hard-version answer where it exists and the easy-version answer otherwise.
HowManyJusticesUsually_hard    <- Recode(originalData$Q14.1, '""=NA; -99="Skipped"')
HowManyJusticesUsually_hard    <- droplevels(HowManyJusticesUsually_hard)
HowManyJusticesUsually_easy    <- Recode(originalData$Q14.2, '""=NA; -99="Skipped"')
HowManyJusticesUsually_easy    <- droplevels(HowManyJusticesUsually_easy)
HowManyJusticesUsually_CE         <- coalesce(
  HowManyJusticesUsually_hard,
  HowManyJusticesUsually_easy)
HowManyJusticesUsually_CE_correct <- HowManyJusticesUsually_CE == '9'

# Open-ended ("OE") answers (Q106); NA for subjects who were not assigned to 
# the open-ended condition.
HowManyJusticesUsually_OE <- as.character(originalData$Q106)
HowManyJusticesUsually_OE[!assignedToHowManyJusticesUsuallyOE] <- NA

# If people had dropped out of the survey by the previous closed-ended 
# question, code them as NA for this question.  [2020 01 16] 
HowManyJusticesUsually_OE[originalData$Q7.1 == ''] <- NA

# An open-ended answer is counted as correct if it contains "9" or "nine" 
# (case-insensitive).  grepl() returns FALSE for NA input, so NA is restored 
# afterward.
HowManyJusticesUsually_OE_correct <- HowManyJusticesUsually_OE
HowManyJusticesUsually_OE_correct <- grepl(
  pattern     = '9|nine', 
  x           = HowManyJusticesUsually_OE, 
  ignore.case = TRUE)
HowManyJusticesUsually_OE_correct[is.na(HowManyJusticesUsually_OE)] <- NA

# Combined correctness: the closed-ended result where available; otherwise 
# the open-ended result.
HowManyJusticesUsually_correct <- HowManyJusticesUsually_CE_correct
HowManyJusticesUsually_correct[is.na(HowManyJusticesUsually_correct)] <- 
  HowManyJusticesUsually_OE_correct[is.na(HowManyJusticesUsually_correct)]

# Make a single variable that indicates all assignments
# (interaction of number-of-options and difficulty, restricted to the listed 
# levels; open-ended subjects are labeled "OE").
HowManyJusticesUsually_condition <- factor(HowManyJusticesUsually_numROs):HowManyJusticesUsually_diff
HowManyJusticesUsually_condition <- factor(
  x      = HowManyJusticesUsually_condition, 
  levels = qw("3:easy 3:hard 5:easy 5:hard OE"))  
HowManyJusticesUsually_condition[assignedToHowManyJusticesUsuallyOE] <- 'OE'



# HOW MANY WOMEN CURRENTLY SERVE?
# Subjects whose "HowManyWomenSeeOpenEnded" value is >= .75 are assigned to 
# the open-ended condition.  The column is a factor, so it is converted to 
# character before it is converted to a number.
HowManyWomenSeeOpenEnded <- as.numeric(as.character(originalData$HowManyWomenSeeOpenEnded))
assignedToHowManyWomenOE <- HowManyWomenSeeOpenEnded >= .75

# Difficulty ("diff") assignment from the display-order column: blanks 
# become NA, and an "OE" level is added for open-ended subjects.
# NOTE(review): "Q17.1|Q17.3" and "Q17.2|Q17.3" are matched as literal 
# factor levels; presumably they are Qualtrics display-order strings -- 
# confirm.
HowManyWomen_diff <- droplevels(originalData$DO.BL.Howmanywomen.)
HowManyWomen_diff <- Recode(
  var     = HowManyWomen_diff,
  recodes = '""=NA; "Q17.1|Q17.3"="hard"; "Q17.2|Q17.3"="easy"')
levels(HowManyWomen_diff) <- c(levels(HowManyWomen_diff), 'OE') 
HowManyWomen_diff[assignedToHowManyWomenOE] <- 'OE'

# Closed-ended ("CE") answers: Q17.1 is the hard version, Q17.2 the easy 
# version.  Blank becomes NA; -99 becomes "Skipped".
HowManyWomen_hard <- Recode(originalData$Q17.1, '""=NA; -99="Skipped"')
HowManyWomen_hard <- droplevels(HowManyWomen_hard)
HowManyWomen_easy <- Recode(originalData$Q17.2, '""=NA; -99="Skipped"')
HowManyWomen_easy <- droplevels(HowManyWomen_easy)
HowManyWomen_CE <- coalesce(HowManyWomen_hard, HowManyWomen_easy)
HowManyWomen_CE_correct   <- HowManyWomen_CE == '3'

# Open-ended ("OE") answers (Q108); NA for subjects who were not assigned to 
# the open-ended condition.
HowManyWomen_OE <- as.character(originalData$Q108)
HowManyWomen_OE[!assignedToHowManyWomenOE] <- NA

# If people had dropped out of the survey by the previous closed-ended 
# question, code them as NA for this question.  [2020 01 16] 
HowManyWomen_OE[originalData$Q16.1 == ''] <- NA

# An open-ended answer is counted as correct if it contains "3" or "three" 
# (case-insensitive).  grepl() returns FALSE for NA input, so NA is restored 
# afterward.
HowManyWomen_OE_correct <- HowManyWomen_OE
HowManyWomen_OE_correct <- grepl(
  pattern     = '3|three', 
  x           = HowManyWomen_OE, 
  ignore.case = TRUE)
HowManyWomen_OE_correct[is.na(HowManyWomen_OE)] <- NA

# Combined correctness: the closed-ended result where available; otherwise 
# the open-ended result.
HowManyWomen_correct <- HowManyWomen_CE_correct
HowManyWomen_correct[is.na(HowManyWomen_correct)] <- 
  HowManyWomen_OE_correct[is.na(HowManyWomen_correct)]

# Make a single variable that indicates all assignments
# (interaction of number-of-options and difficulty, restricted to the listed 
# levels; open-ended subjects are labeled "OE").
HowManyWomen_condition <- factor(HowManyWomen_numROs):HowManyWomen_diff
HowManyWomen_condition <- factor(
  x      = HowManyWomen_condition, 
  levels = qw("3:easy 3:hard 5:easy 5:hard OE"))  
HowManyWomen_condition[assignedToHowManyWomenOE] <- 'OE'


# The four questions below follow one pattern: blank answers are recoded to 
# NA, and the "_correct" variable is TRUE when the answer equals the correct 
# choice text exactly (NA when the answer is NA).

# HOW ARE SUPREME COURT JUSTICES CHOSEN?
ProcedureForChoosingJustices         <- Recode(originalData$Q5.1, '""=NA') 
ProcedureForChoosingJustices_correct <- ProcedureForChoosingJustices == 'Nominated by the President, then confirmed by the Senate'


# CAN YOU WATCH LAWYERS ARGUE BEFORE THE COURT?
WatchLawyersArgue         <- Recode(originalData$Q6.1, '""=NA') 
WatchLawyersArgue_correct <- WatchLawyersArgue == 'Yes, if one goes to Washington, D.C. to see the Court in person.'


# FINAL SAY OVER CONSTITUTIONALITY
FinalSay         <- Recode(originalData$Q7.1, '""=NA') 
FinalSay_correct <- FinalSay == 'Supreme Court'


# TIE-VOTE PROCEDURE
TieVoteProcedure         <- Recode(originalData$Q16.1, '""=NA') 
TieVoteProcedure_correct <- TieVoteProcedure == 'The decision of a lower court stands'


# TERM LENGTH OF SUPREME COURT JUSTICES
# Subjects whose "TermLengthSeeOpenEnded" value is >= .75 are assigned to 
# the open-ended condition.  The column is a factor, so it is converted to 
# character before it is converted to a number.
TermLengthSeeOpenEnded <- as.numeric(as.character(originalData$TermLengthSeeOpenEnded))
assignedToTermLengthOE <- TermLengthSeeOpenEnded >= .75

# Difficulty ("diff") assignment from the display-order column: blanks 
# become NA, and an "OE" level is added for open-ended subjects.  Unlike 
# the other difficulty-manipulated questions, the recoded levels here are 
# the bare strings "Q20.1" and "Q20.2".
TermLength_diff    <- droplevels(originalData$DO.BL.Termlength)
TermLength_diff    <- Recode(
  var     = TermLength_diff,
  recodes = '""=NA; "Q20.1"="hard"; "Q20.2"="easy"')
levels(TermLength_diff) <- c(levels(TermLength_diff), 'OE') 
TermLength_diff[assignedToTermLengthOE] <- 'OE'

# Closed-ended ("CE") answers: Q20.1 is the hard version, Q20.2 the easy 
# version.  Blank becomes NA; -99 becomes "Skipped".
TermLength_hard       <- Recode(originalData$Q20.1, '""=NA; -99="Skipped"')
TermLength_hard       <- droplevels(TermLength_hard)
TermLength_easy       <- Recode(originalData$Q20.2, '""=NA; -99="Skipped"')
TermLength_easy       <- droplevels(TermLength_easy)
TermLength_CE         <- coalesce(TermLength_hard, TermLength_easy)
TermLength_CE_correct <- TermLength_CE == 'Life term'

# Open-ended ("OE") answers (Q110); NA for subjects who were not assigned to 
# the open-ended condition.
TermLength_OE <- as.character(originalData$Q110)
TermLength_OE[!assignedToTermLengthOE] <- NA

# If people had dropped out of the survey by the previous closed-ended 
# question, code them as NA for this question.  [2020 01 16] 
TermLength_OE[originalData$Q101 == ''] <- NA

# An open-ended answer is counted as correct if it matches any of the 
# alternatives in the pattern below (case-insensitive): e.g., "death", 
# "forever", "indefinite", "life", "no limit", "resign", "retire".  grepl() 
# returns FALSE for NA input, so NA is restored afterward.
TermLength_OE_correct <- TermLength_OE
TermLength_OE_correct <- grepl(
  pattern     = 'death|for\\s?ever|indefini?te|life|long.*(?:want|choose|serve)|no limit|resign|retire', 
  x           = TermLength_OE, 
  ignore.case = TRUE)
TermLength_OE_correct[is.na(TermLength_OE)] <- NA

# Combined correctness: the closed-ended result where available; otherwise 
# the open-ended result.
TermLength_correct <- TermLength_CE_correct
TermLength_correct[is.na(TermLength_correct)] <- 
  TermLength_OE_correct[is.na(TermLength_correct)]

# Make a single variable that indicates all assignments
# (interaction of number-of-options and difficulty, restricted to the listed 
# levels; open-ended subjects are labeled "OE").
TermLength_condition <- factor(TermLength_numROs):TermLength_diff
TermLength_condition <- factor(
  x      = TermLength_condition, 
  levels = qw("3:easy 3:hard 5:easy 5:hard OE"))  
TermLength_condition[assignedToTermLengthOE] <- 'OE'


# WHEN CAN THE COURT ACT?
# Blank answers are recoded to NA; "_correct" is TRUE when the answer equals 
# the correct choice text exactly (NA when the answer is NA).
CourtPower         <- Recode(originalData$Q21.1, '""=NA') 
CourtPower_correct <- CourtPower == 'It can interpret laws only to settle disputes between two parties in a legal case'


# CAN JUSTICES EVER BE REMOVED FROM OFFICE?
# Same pattern as above.
JusticeRemoval         <- Recode(originalData$Q22.1, '""=NA') 
JusticeRemoval_correct <- JusticeRemoval == 'Yes, if they are impeached by the House of Representatives and convicted by the Senate'


# WHO IS THE SENATE MAJORITY LEADER?
# Subjects whose "SenMajLeaderSeeOpenEnded" value is >= .75 are assigned to 
# the open-ended condition.  The column is a factor, so it is converted to 
# character before it is converted to a number.
SenMajLeaderSeeOpenEnded <- as.numeric(as.character(originalData$SenMajLeaderSeeOpenEnded))
assignedToSenMajLeaderOE <- SenMajLeaderSeeOpenEnded >= .75

# Difficulty ("diff") assignment from the display-order column.  Note the 
# numbering for this question: Q11.1 is the EASY version and Q11.2 is the 
# HARD version.
# NOTE(review): "Q11.1|Q11.3" and "Q11.2|Q11.3" are matched as literal 
# factor levels; presumably they are Qualtrics display-order strings -- 
# confirm.
SenMajLeader_diff <- droplevels(originalData$DO.BL.SenateMajorityLeader.Closed.Ended)
SenMajLeader_diff <- Recode(
  var     = SenMajLeader_diff,
  recodes = '""=NA; "Q11.1|Q11.3"="easy"; "Q11.2|Q11.3"="hard"')
levels(SenMajLeader_diff) <- c(levels(SenMajLeader_diff), 'OE') 
SenMajLeader_diff[assignedToSenMajLeaderOE] <- 'OE'

# Closed-ended ("CE") answers.  Blank becomes NA; -99 becomes "Skipped".
SenMajLeader_hard <- Recode(originalData$Q11.2, '""=NA; -99="Skipped"')
SenMajLeader_hard <- droplevels(SenMajLeader_hard)
SenMajLeader_easy <- Recode(originalData$Q11.1, '""=NA; -99="Skipped"')
SenMajLeader_easy <- droplevels(SenMajLeader_easy)
SenMajLeader_CE   <- coalesce(SenMajLeader_hard, SenMajLeader_easy)
SenMajLeader_CE_correct <- SenMajLeader_CE == 'Mitch McConnell' 

# Open-ended ("OE") answers (Q13.1); NA for subjects who were not assigned 
# to the open-ended condition.
SenMajLeader_OE <- as.character(originalData$Q13.1)
SenMajLeader_OE[!assignedToSenMajLeaderOE] <- NA

# If people had dropped out of the survey by the previous closed-ended 
# question, code them as NA for this question.  [2020 01 16] 
SenMajLeader_OE[originalData$Q7.1 == ''] <- NA

# An open-ended answer is counted as correct if it matches a tolerant 
# spelling of "Mitch" or "McConnell" (case-insensitive).  grepl() returns 
# FALSE for NA input, so NA is restored afterward.  Two manual overrides 
# follow, both matched case-sensitively: answers containing "mcconall" are 
# counted as correct, and answers containing "Mitch McConnell And Harry 
# Reid" are counted as incorrect.
SenMajLeader_OE_correct <- SenMajLeader_OE
SenMajLeader_OE_correct <- grepl(
  pattern     = 'Mit?ch|Mc?\\s?C?onn?ell?',  
  x           = SenMajLeader_OE, 
  ignore.case = TRUE)
SenMajLeader_OE_correct[is.na(SenMajLeader_OE)] <- NA
SenMajLeader_OE_correct[grepl('mcconall', SenMajLeader_OE)]                       <- TRUE
SenMajLeader_OE_correct[grepl('Mitch McConnell And Harry Reid', SenMajLeader_OE)] <- FALSE

# Combined correctness: the closed-ended result where available; otherwise 
# the open-ended result.
SenMajLeader_correct <- SenMajLeader_CE_correct
SenMajLeader_correct[is.na(SenMajLeader_correct)] <- 
  SenMajLeader_OE_correct[is.na(SenMajLeader_correct)]

# Make a single variable that indicates all assignments
SenMajLeader_condition <- factor(SenMajLeader_numROs):SenMajLeader_diff
SenMajLeader_condition <- factor(
  x      = SenMajLeader_condition, 
  levels = qw("3:easy 3:hard 5:easy 5:hard OE"))  # remove unused categories, e.g., "3:OE"
SenMajLeader_condition[assignedToSenMajLeaderOE] <- 'OE'


# WHO IS THE CHIEF JUSTICE?
# Two random draws determine assignment.  Subjects whose 
# "ChiefJusticeSeeOpenEnded" value is >= .75 get the open-ended version.  
# Among the remaining subjects, those whose "ChiefJusticeSeeSuperEasy" value 
# is <= .0667 get the "super-easy" closed-ended version.
ChiefJusticeSeeOpenEnded        <- as.numeric(as.character(originalData$ChiefJusticeSeeOpenEnded))
ChiefJusticeSeeSuperEasy        <- as.numeric(as.character(originalData$ChiefJusticeSeeSuperEasy))
assignedToChiefJusticeOE        <- ChiefJusticeSeeOpenEnded >= .75
assignedToChiefJusticeSuperEasy <- !assignedToChiefJusticeOE & (ChiefJusticeSeeSuperEasy <= .0667)

# Difficulty ("diff") assignment from the display-order column.  For this 
# question, Q8.1 is the EASY version and Q8.2 is the HARD version.  "OE" 
# and "superEasy" levels are added and then assigned from the indicators 
# above.
# NOTE(review): "Q8.1|Q8.4" and "Q8.2|Q8.4" are matched as literal factor 
# levels; presumably they are Qualtrics display-order strings -- confirm.
ChiefJustice_diff <- droplevels(originalData$DO.BL.ChiefJustice.Closed.Ended)
ChiefJustice_diff <- Recode(
  var     = ChiefJustice_diff,
  recodes = '""=NA; "Q8.1|Q8.4"="easy"; "Q8.2|Q8.4"="hard"')
levels(ChiefJustice_diff) <- c(levels(ChiefJustice_diff), 'OE', 'superEasy') 
ChiefJustice_diff[assignedToChiefJusticeOE]        <- 'OE'
ChiefJustice_diff[assignedToChiefJusticeSuperEasy] <- 'superEasy'

# Closed-ended ("CE") answers: Q8.2 (hard), Q8.1 (easy), Q8.3 (super-easy).  
# Blank becomes NA; -99 becomes "Skipped".  coalesce() takes the first 
# non-missing answer in that order.
ChiefJustice_hard       <- Recode(originalData$Q8.2, '""=NA; -99="Skipped"')
ChiefJustice_hard       <- droplevels(ChiefJustice_hard)
ChiefJustice_easy       <- Recode(originalData$Q8.1, '""=NA; -99="Skipped"')
ChiefJustice_easy       <- droplevels(ChiefJustice_easy)
ChiefJustice_superEasy  <- Recode(originalData$Q8.3, '""=NA; -99="Skipped"')
ChiefJustice_superEasy  <- droplevels(ChiefJustice_superEasy)
ChiefJustice_CE         <- coalesce(
  ChiefJustice_hard, 
  ChiefJustice_easy, 
  ChiefJustice_superEasy)
ChiefJustice_CE_correct <- ChiefJustice_CE == 'John Roberts' 

# Open-ended ("OE") answers (Q10.1); NA for subjects who were not assigned 
# to the open-ended condition.
ChiefJustice_OE   <- as.character(originalData$Q10.1)
ChiefJustice_OE[!assignedToChiefJusticeOE] <- NA

# If people had dropped out of the survey by the previous closed-ended 
# question, code them as NA for this question.  [2020 01 16] 
ChiefJustice_OE[originalData$Q7.1 == ''] <- NA

# An open-ended answer is counted as correct if it contains "Roberts" 
# (case-insensitive) or the misspelling "rpberts" (case-sensitive).  
# grepl() returns FALSE for NA input, so NA is restored afterward.
ChiefJustice_OE_correct <- ChiefJustice_OE
ChiefJustice_OE_correct <- grepl(
  pattern     = 'Roberts', 
  x           = ChiefJustice_OE, 
  ignore.case = TRUE)
ChiefJustice_OE_correct[grepl('rpberts', ChiefJustice_OE)] <- TRUE
ChiefJustice_OE_correct[is.na(ChiefJustice_OE)] <- NA

# Combined correctness: the closed-ended result where available; otherwise 
# the open-ended result.
ChiefJustice_correct <- ChiefJustice_CE_correct
ChiefJustice_correct[is.na(ChiefJustice_correct)] <- 
  ChiefJustice_OE_correct[is.na(ChiefJustice_correct)]

# Make a single variable that indicates all assignments
# (interaction of number-of-options and difficulty, restricted to the listed 
# levels; open-ended subjects are labeled "OE").
ChiefJustice_condition <- factor(ChiefJustice_numROs):ChiefJustice_diff
ChiefJustice_condition <- factor(
  x      = ChiefJustice_condition, 
  levels = qw("3:superEasy 3:easy 3:hard 5:superEasy 5:easy 5:hard OE"))
ChiefJustice_condition[assignedToChiefJusticeOE] <- 'OE'



# **************************************************************************
# CREATE BINARY ("TF") VERSIONS OF THE "LEVELS OF DIFFICULTY" VARIABLES ####
# **************************************************************************
# All six questions for which we manipulated difficulty had "easy" and "hard" 
# conditions as well as an open-ended condition. The what-is-the-name-of-the-
# Chief-Justice question also had "super-easy" conditions. For all six of
# these questions, there is a corresponding "diff" variable (e.g., 
# "SenMajLeader_diff") that indicates the version to which a subject was 
# assigned (super-easy, easy, hard, or open-ended).  But for some analyses, it 
# will help to have binary versions of these variables in which "super-easy" 
# and "open-ended" are coded as NA. These binary variables are "diffTF" 
# variables: "ChiefJustice_diffTF", etc.  [2017 01 19, 2020 01 14]
# Names of the six "diff" variables created above.
diff_names <- qw(
  "ChiefJustice_diff 
  SenMajLeader_diff 
  HowManyJusticesCurrently_diff 
  HowManyJusticesUsually_diff 
  HowManyWomen_diff
  TermLength_diff")
# For each "diff" variable, look it up by name, recode "superEasy" and "OE" 
# to NA, and store the result under the same name with the suffix "TF" 
# (e.g., "ChiefJustice_diffTF").  The loop variable `i` and the temporary 
# object `tmp` remain defined after the loop.
for (i in diff_names) {
  tmp <- get(i)
  tmp <- Recode(tmp, 'c("superEasy", "OE") = NA')
  assign(paste0(i, 'TF'), value = tmp)
}



# **************************************************************************
# DEMOGRAPHIC AND BACKGROUND VARIABLES ####
# **************************************************************************

# AGE
# Age is computed as "2016 - yearOfBirth" even though the survey was fielded 
# in 2017.  A subject who was truly 18 may therefore appear as 17 in our age 
# variable.  Because such subjects may well be 18, we do not treat their 
# responses as missing data.  See "Missing Covariate Values" in our 
# pre-analysis plan for details. 
#   Entries that cannot be read as integers become NA (the coercion warning 
# is suppressed).  Years before 1916 or after 1999 are also set to NA.
yearOfBirth <- suppressWarnings(
  as.integer(as.character(originalData$Q25.2)))
is.na(yearOfBirth) <- which(yearOfBirth < 1916 | yearOfBirth > 1999)
age <- 2016 - yearOfBirth


# EDUCATION
# Blank and -99 answers are first recoded to "missing".  The variable is 
# then converted to an ordered factor whose levels run from "No formal 
# schooling" to "Other post-college degree".  "missing" is not among the 
# listed levels, so those cases become NA in the ordered factor.  `labels` 
# supplies a short name for each level, in the same order as `levels`.
educ <- Recode(droplevels(originalData$Q109), 'c("", -99) = "missing"')
educ <- ordered(
  x      = educ, 
  levels = c(
    'No formal schooling', '1st grade', '2nd grade', '3rd grade', '4th grade', 
    '5th grade', '6th grade', '7th grade', '8th grade', '9th grade', 
    '10th grade', '11th grade', '12th grade, no GED or diploma', 
    'Graduate equivalence degree', '12th grade, diploma', 
    'Some college, no degree', 'Associate\'s degree', 'Bachelor\'s degree', 
    'Master\'s degree', 'Other post-college degree'),
  labels = qw(
    "None 1st 2nd 3rd 4th 5th 6th 7th 8th 9th 10th 11th 12th_noDiploma
     GED 12thDiploma someCollege Associate's Bachelor's Master's 
     otherPostCollege"))
  

# GENDER
# Standardize free-text gender responses.
#
# Args:
#   x: a character vector of responses.
#
# Returns:
#   A character vector of the same length.  Ignoring case and leading white 
#   space, a response that starts with "f" or with "woman" becomes "female", 
#   and a response that starts with "m" becomes "male".  All other 
#   responses (including NA) are returned unchanged.
recodeGender <- function(x) {
  x <- sub('^\\s*?f.*',     'female', x, perl = TRUE, ignore.case = TRUE)
  x <- sub('^\\s*?woman.*', 'female', x, perl = TRUE, ignore.case = TRUE)
  x <- sub('^\\s*?m.*',       'male', x, perl = TRUE, ignore.case = TRUE)
  
  # Return the value visibly.  (Ending the function on an assignment would 
  # return it invisibly, so a bare call would print nothing.)
  x
} 
# `gender` holds the standardized free-text answers to Q25.4.  `female` is 
# TRUE for "female", FALSE for "male", and NA for every other response.
gender <- recodeGender(as.character(originalData$Q25.4))
female <- Recode(gender, '"female"="female"; "male"="male"; else=NA') == 'female'


# PARTY ID
# Party ID is built from a stem question (Q24.2) and three follow-up 
# ("branch") questions (Q24.3-Q24.5).  For each, blank and -99 answers are 
# set to NA, and any entry that contains the start of the question wording 
# is also set to NA.
PID_stem      <- Recode(originalData$Q24.2, 'c(-99, "") = NA')
PID_stem[grepl('Generally speaking', PID_stem)] <- NA   
PID_stem      <- droplevels(PID_stem)

PID_branchDem <- Recode(originalData$Q24.3, 'c(-99, "") = NA')
PID_branchDem[grepl('Would you', PID_branchDem)] <- NA  
PID_branchDem <- droplevels(PID_branchDem)

PID_branchRep <- Recode(originalData$Q24.4, 'c(-99, "") = NA')
PID_branchRep[grepl('Would you', PID_branchRep)] <- NA  
PID_branchRep <- droplevels(PID_branchRep)

PID_branchInd <- Recode(originalData$Q24.5, 'c(-99, "") = NA')
PID_branchInd[grepl('Do you think of yourself', PID_branchInd)] <- NA  
PID_branchInd <- droplevels(PID_branchInd)

# Seven-point scale.  The stem gives 2 (Democrat), 6 (Republican), or 
# 4 (any other non-missing answer).  The branch questions then refine it: 
# strong partisans move to 1 or 7, and those who lean toward a party move 
# to 3 or 5.
PID           <- Recode(PID_stem, '"Democrat"=2; "Republican"=6; NA=NA; else=4', as.factor=FALSE)
PID[PID_branchDem=='A strong Democrat']   <- 1
PID[PID_branchRep=='A strong Republican'] <- 7
PID[PID_branchInd=='Democratic Party']    <- 3
PID[PID_branchInd=='Republican Party']    <- 5

# Spell out the levels so that each label is tied to its numeric code.  
# Without `levels`, the labels are matched to whichever values happen to 
# occur in the data, and the call fails if any of the seven is absent.
PID     <- ordered(PID, levels = 1:7, labels = qw("SD D LD I LR R SR"))


# POLITICAL KNOWLEDGE (GENERAL)
# Three items.  `Pence` is TRUE when the free-text answer to Q113 matches 
# one of several spellings of "VP" / "vice" (case-insensitive).  `Yellen` 
# and `SenateTerm` are TRUE when the answers to Q94 and Q95 equal the 
# correct choice text.  `generalKnowledge` is the sum of the three items.
Pence <- grepl(
  pattern     = 'v\\.?p\\.?|vi[cstv]?e|vs president|vic president', 
  x           = originalData$Q113, 
  ignore.case = TRUE)
Yellen     <- originalData$Q94 == 'Chair of the Federal Reserve System'
SenateTerm <- originalData$Q95 == '6 years'

generalKnowledge <- Pence + Yellen + SenateTerm


# RACE -- FROM OPEN-ENDED QUESTION
# raceOE keeps the raw free-text answers to Q25.3.  `race` starts as a 
# lower-cased copy; the recoding rules that follow operate on `race`.
raceOE <- as.character(originalData$Q25.3)
race <- tolower(raceOE)

# Eliminate ALL trailing white space.  (A pattern of the form 
# '(.*)\\s+$' replaced by '\\1' removes only the last white-space 
# character, because the greedy '.*' absorbs the rest; an answer ending in 
# two spaces would keep one of them and then fail the exact-match recodes.)
race <- sub('\\s+$', '', race)

# Code the "missing" category. Note that "american" is coded as missing only 
# if it is the entire description. ("African American" is not coded as 
# missing.)
# Responses that can be parsed as numbers are found first; the coercion 
# warning for the non-numeric responses is suppressed.
raceIsANumber <- race %>%
  { suppressWarnings(as.numeric(.)) } %>%
  { !is.na(.) }
race[raceIsANumber] <- 'missing'  # Code response to the race question like "7" and "83" as missing data.
race <- Recode(race, ' ""="missing" ')
race <- Recode(race, 'qw("amarican american c california cuastico demercrate engineering fdgbfdgfdsg first gfdgfd good hbkgclyfyguhj hiss human jhghgdd l33 looking none other republican shite unknown usa vcxvcx very verygood w w5he whne x")="missing"')
race <- Recode(race, 'c("don\'t know", "environmental engineering", "eurasian", "human race", "los angeles", "n a", "native hue man", "not sure", "prefer not to say", "the only")="missing"')
rm(raceIsANumber)

# Code the "multiracial" category
# The gsub() call replaces any response that contains one of the listed 
# fragments (including "/") with "multiracial".  The Recode() calls handle 
# specific whole responses.
# NOTE(review): every response containing "/" is already recoded by the 
# gsub() call, so the last Recode() below ("white/native american") appears 
# to be redundant.
race <- gsub('.*(?:bi|mix|mukltracial|multi|part|/).*', 'multiracial', race)
race <- Recode(race, '"east asian, white"="multiracial"')
race <- Recode(race, '"scots-irish cherokee"="multiracial"')
race <- Recode(race, '"white & american indian"="multiracial"')
race <- Recode(race, '"white\\\\native american"="multiracial"')
race <- Recode(race, '"white/native american"="multiracial"')

# Code white
# The fourth gsub() pattern matches many misspellings of "white".  The 
# Recode() call maps specific whole responses to "white".
race <- gsub("blanc[ao]|whitetem|whiteem", "white", race)
race <- gsub('.*cau.*', 'white', race)
race <- gsub('.*of western european descent.*', 'white', race)
race <- gsub('w[ihru]?[ar]?[ih][lty]?h?[erw][ert]?m?', 'white', race)
race <- Recode(race, 'c("anglo-saxon", "cacusaian", "euro-american", "european", "european american", "greek", "scandinavian-american", "swiss", "white american", "white non-hispanic", "who te", "wtine")="white"')
race <- gsub("whiteem", "white", race)  # Must be done a second time to catch one difficult case

# Code black
# Any response that contains one of the listed fragments becomes "black".
race <- gsub('.*(?:aa|afra?ici?an|afro|blac[jk]|blk|nigga).*', 'black', race)

# Code Hispanic or Latino
# Any response that contains one of the listed fragments becomes 
# "HispanicOrLatino"; three specific whole responses are also recoded.
race <- gsub('.*(?:espanic|hispacni|hispanic|hispan[oy]?|latin[ao]?|mexican|peruvian|puerto\\s?rican|puerto rico).*', 'HispanicOrLatino', race)
race <- Recode(race, '"mexican american"="HispanicOrLatino"')
race <- Recode(race, '"white and hispanic"="HispanicOrLatino"')
race <- Recode(race, '"white, hispanic"="HispanicOrLatino"')

# Code Asian or Pacific islander
# NOTE(review): the pattern contains "indian", so any remaining response 
# that contains "indian" (e.g., "american indian") is coded "asian" here, 
# whereas "native american" is coded "other" below -- confirm that this is 
# intended.
race <- gsub('.*(?:asian?|asain|cambodian|chinese|filipino|hawaiian|indian|japanese|korean|pacific).*', 'asian', race)
race <- Recode(race, '"pacific islander"="asian"')

# Collapse racial categories that have too few respondents to be used in 
# regressions by themselves.
race <- Recode(race, ' c("native american", "persian")="other" ')


# RACE -- SSI
# Read the respondent-information spreadsheet and rename two of its columns 
# (IdParameter -> psid, "Race_Ethnicity US" -> raceSSI).  Then attach it, 
# by psid, to a table holding each subject's psid, raw open-ended race 
# answer, and coded race.
# NOTE(review): the join assumes that psid has a compatible type in both 
# tables and that each psid appears at most once in the spreadsheet.  If a 
# psid were repeated there, the join would add rows and raceSSI would no 
# longer line up with originalData -- TODO confirm.
dataSSI <- read_excel(here::here('data/2017-03-30_SSI_knowledgeAndResponseOptions_respondentInfo.xlsx')) %>%
  rename(psid = IdParameter, raceSSI = `Race_Ethnicity US`)
raceCompareDF <- tibble(psid = originalData$psid, raceOE, raceCoded = race) %>%
  left_join(x = ., y = dataSSI, by = 'psid')
raceSSI <- pull(raceCompareDF, raceSSI)


# RACE -- HYBRID VARIABLE
# Use the SSI-provided race where it is available and our coding of the 
# open-ended answer otherwise.  Then harmonize the category labels; 
# "missing" becomes NA.
raceCoalesced <- coalesce(raceCompareDF$raceSSI, raceCompareDF$raceCoded) %>%
  Recode(' "asian"="Asian"; "black"="African American"; "HispanicOrLatino"="Hispanic"; "other"="Other"; "white"="White"; "American Indian or Alaska Native"="Other"; "missing"=NA ')


# STATE
# We code blank and -99 state values as "missing" (rather than NA) so that 
# "missing" can be included as a factor level when we use state fixed 
# effects in our regression analyses.  [2020 01 16]
state <- Recode(droplevels(originalData$Q112), ' c("-99", "")="missing" ')


# IP ADDRESS
# questionKey was built from the first row of the raw export, so a position 
# in questionKey refers to the ORIGINAL column order.  Columns have been 
# added to originalData since then (e.g., "psid_IP" and "breakOffs"), so 
# that position may now point at a different column.  We therefore 
# translate the position into a column name and select the column by name.
IP_index <- grep('IP', questionKey)
IP <- as.character(originalData[, names(questionKey)[IP_index]])



# **************************************************************************
# DURATION MEASURE AND ATTENTION QUESTIONS ####
# **************************************************************************
# Qualtrics automatically exports a variable called "Finished," which equals 
# 1 for respondents who finished the survey, 0 otherwise. In this dataset,  
# it is variable V10.  We can verify this by running 
# grep('Finished', questionKey, value = TRUE).
finished <- droplevels(originalData$V10) == 1

# SCREENER QUESTIONS
# We used two screeners. The "passedScreenerMedia" question appeared near the 
# start of the survey and asked subjects to skip the question instead of  
# choosing their most trusted news source from a list of options. The 
# "passedScreenerSalary" question asked subjects to choose "Don't know"  
# instead of guessing Justices' salaries.  [2017 02 20]
#   The use of multiple screeners was motivated partly by Berinsky, Margolis,
# and Sances (2014).
#   Blank answers are recoded to NA first, so each "passed" variable is NA 
# for subjects with no recorded answer.  A subject passes the media 
# screener when the answer is -99 (seen but unanswered, i.e., skipped).  
# screenerSum is the number of screeners passed.
passedScreenerMedia  <- Recode(originalData$Q123, '""=NA') == '-99'
passedScreenerSalary <- Recode(originalData$Q101, '""=NA') == 'Don\'t know'
screenerSum          <- passedScreenerMedia + passedScreenerSalary 


# OTHER VARIABLES
# Duration is measured in seconds. Note that it's imperative to convert 
# "Q_TotalDuration" to a character vector before converting it to an integer.
# The column is a factor (the data were read with stringsAsFactors = TRUE), 
# and as.integer() applied directly to a factor returns the internal level 
# codes rather than the recorded durations.  [2017 02 20]
# NOTE(review): the start and end times are parsed without an explicit time 
# zone, so they are interpreted in the session's time zone -- confirm.
duration  <- as.integer(as.character(originalData$Q_TotalDuration))
startTime <- as.POSIXlt(originalData$V8) 
endTime   <- as.POSIXlt(originalData$V9)




# **************************************************************************
# PLACEBO QUESTION ####
# **************************************************************************
# We asked a very difficult placebo question, "What job or political office 
# did Horatio King hold?", to gauge the extent to which people were looking
# up answers. (King was a 19th-century postmaster general.)
# An answer is counted as correct if it contains "postm" or "post m" 
# (case-insensitive).
placeboCorrect <- grepl('post\\s?m', originalData$Q15.1, ignore.case = TRUE)

# Subjects with no recorded answer are coded NA rather than "incorrect".  
# The data are read with blank strings kept as "" (not as NA), so an 
# is.na() test alone never fires; blank entries must be checked explicitly, 
# as is done for the open-ended knowledge questions above.
placeboCorrect[is.na(originalData$Q15.1) | originalData$Q15.1 == ''] <- NA



# **************************************************************************
# IMPORT SURVEY WEIGHTS ####
# **************************************************************************
# The weights are optional: they are merged in only if data/weights.csv 
# exists.  The file is read as two columns, a character column followed by 
# a numeric column.
# NOTE(review): left_join() is called without `by`, so it joins on every 
# column name that the two tables share.  Presumably the character column 
# is a respondent identifier that also exists in originalData (where it may 
# be a factor) -- confirm the key, and consider naming it explicitly.
if (file.exists(here::here("data/weights.csv"))) {
  SSI_weights <- read.csv(
      file       = here::here("data/weights.csv"), 
      colClasses = c("character", "numeric"))
  originalData <- left_join(originalData, SSI_weights)
  rm(SSI_weights)
}